'India Broadband' is a well established company in the broadband space. Due to immense competition, they are facing a major problem of customer churn and dissatisfaction due to broadband outages. The company has curated a dataset, where it tracks several variables that it believes impact the 'outage_duration'. They have tracked three different 'outage durations': '0' for no outage, '1' for short outages that last between a few minutes and a maximum of 2 hours, and '2' for long outages that can last from 2 hours to sometimes even a couple of days. We use the metrics that the company has tracked to create a machine learning model that will be able to predict the 'outage_duration' using the quantitative and qualitative features.
Target attribute: "outage_duration", where: 0=no outage, 1=short outages (between a few minutes and a maximum of 2 hours), 2=long outages (from 2 hours to sometimes even a couple of days)
This is a classification problem (3 classes = 0,1,2)
The evaluation metric used is 'F1 Macro Average'
# Mount Google Drive into the Colab runtime so data files stored there are readable.
from google.colab import drive
drive.mount('/gdrive')
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, RandomizedSearchCV
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot, plot
from scipy.stats.mstats import mode
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
def accuracy_precision_recall_f1_metrics(y_true, y_pred):
    """Print the macro-averaged F1 score for true vs. predicted labels.

    NOTE(review): `convert_for_sklearn` is not defined anywhere in this
    notebook -- confirm it is supplied by the runtime environment.
    """
    y_test_scoring = convert_for_sklearn(y_true)
    test_pred_scoring = convert_for_sklearn(y_pred)
    # Bug fix: the original referenced the undefined name `ytest_scoring`
    # (missing underscore), which raised NameError whenever this was called.
    f1 = f1_score(y_true=y_test_scoring, y_pred=test_pred_scoring, average='macro')
    print("Test F1 Score:", f1)
# List the contents of the Kaggle-style input directory.
print(os.listdir('../input'))
# First attempt: load every raw table from ../input.
train_data_1 = pd.read_csv("../input/train_data.csv")
broadband_1 = pd.read_csv("../input/broadband_data.csv")
outage_1 = pd.read_csv("../input/outage_data.csv")
report_1 = pd.read_csv("../input/report_data.csv")
server_1 = pd.read_csv("../input/server_data.csv")
test_data_1 = pd.read_csv("../input/test_data-1593798292529.csv")
# NOTE(review): the reads below overwrite the frames just loaded with copies
# from the current working directory -- only one of the two locations is
# needed; keep whichever matches the runtime environment.
train_data_1 = pd.read_csv("train_data.csv")
broadband_1 = pd.read_csv("broadband_data.csv")
outage_1 = pd.read_csv("outage_data.csv")
report_1 = pd.read_csv("report_data.csv")
server_1 = pd.read_csv("server_data.csv")
sample = pd.read_csv("sample_submission.csv")
test_data_1 = pd.read_csv("test_data-1593798292529.csv")
# Quick peek at the loaded tables.
train_data_1.head(2)
test_data_1.head(2)
# Row/column counts of every table.
print(train_data_1.shape)
print(broadband_1.shape)
print(outage_1.shape)
print(report_1.shape)
print(server_1.shape)
print(sample.shape)
print(test_data_1.shape)
Checking the unique values of 'id' column in all the datasets to understand duplicate values and thereby help in understanding the basis of merging the datasets.
# Count distinct ids per table; tables where this is below the row count
# contain duplicated ids and need special handling before merging.
print(len(train_data_1.id.unique()))
print(len(broadband_1.id.unique()))
print(len(outage_1.id.unique()))
print(len(report_1.id.unique()))
print(len(server_1.id.unique()))
print(len(test_data_1.id.unique()))
We understand that duplicate values for 'id' column exist in broadband, report, and server datasets.
On the basis of id column, merging the datasets after checking and dropping the duplicates for each of the datasets:
Checking duplicates in outage
# Duplicate-id count in the outage table (0 expected -> safe one-to-one merge).
outage_1.duplicated("id").sum()
# 'g' numbers repeated ids within each frame (cumcount) so rows pair up
# positionally when merging tables that share duplicated ids.
train_data_1['g'] = train_data_1.groupby('id').cumcount()
outage_1['g'] = outage_1.groupby('id').cumcount()
result = pd.merge(train_data_1,outage_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (result)
# Same pairing trick for the test split.
test_data_1['g'] = test_data_1.groupby('id').cumcount()
outage_1['g'] = outage_1.groupby('id').cumcount()
test1 = pd.merge(test_data_1,outage_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (test1)
result.head(2)
test1.head(2)
Checking shape of broadband
broadband_1.shape
Checking duplicates of broadband
broadband_1.duplicated("id").sum()
# Merge broadband features into train/test via the same id + within-id-position key.
result['g'] = result.groupby('id').cumcount()
broadband_1['g'] = broadband_1.groupby('id').cumcount()
result2 = pd.merge(result,broadband_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (result2)
test1['g'] = test1.groupby('id').cumcount()
broadband_1['g'] = broadband_1.groupby('id').cumcount()
test2 = pd.merge(test1,broadband_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (test2)
result2.head(2)
test2.head(2)
Checking shape of report
report_1.shape
Checking duplicates of report
report_1.duplicated("id").sum()
# Merge report features.
result2['g'] = result2.groupby('id').cumcount()
report_1['g'] = report_1.groupby('id').cumcount()
result3 = pd.merge(result2,report_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (result3)
test2['g'] = test2.groupby('id').cumcount()
report_1['g'] = report_1.groupby('id').cumcount()
test3 = pd.merge(test2,report_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (test3)
result3.head(2)
test3.head(2)
Checking shape of server
server_1.shape
Checking duplicates of server
server_1.duplicated("id").sum()
# Merge server features -> final merged train_1 / test_1 frames of pipeline 1.
result3['g'] = result3.groupby('id').cumcount()
server_1['g'] = server_1.groupby('id').cumcount()
train_1 = pd.merge(result3,server_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (train_1)
test3['g'] = test3.groupby('id').cumcount()
server_1['g'] = server_1.groupby('id').cumcount()
test_1 = pd.merge(test3,server_1,on=["id", 'g'],how='left').drop('g', axis=1)
print (test_1)
train_1.head(2)
test_1.head(2)
train_1.shape, test_1.shape
sample.shape
#!pip install pandas-profiling[notebook,html]
# Automated EDA report: per-column stats, missing values, correlations.
import pandas_profiling
pandas_profiling.ProfileReport(train_1)
We get a detailed report and insights on the entire dataset including the features - categorical & numerical, missing values, correlation between different numerical features, unique/distinct values, and a sample of the dataset (first few & last few rows).
# Basic structure / summary checks.
train_1.dtypes
train_1.describe()
train_1.describe(include='object')
train_1.info()
train_1.isnull().sum()
There are no missing values in the train data.
# Cast the qualitative columns to pandas 'category' dtype.
for col in ['outage_type','broadband_type','log_report_type','transit_server_type','area_code']:
    train_1[col] = train_1[col].astype('category')
# Split column names into categorical vs. numeric attribute lists.
cat_attr = list(train_1.select_dtypes("category").columns)
num_attr = list(train_1.columns.difference(cat_attr))
num_attr
cat_attr
train_1.dtypes
# Class balance of the target.
sns.countplot(x='outage_duration',data = train_1)
plt.show()
train_1.outage_duration.value_counts(normalize = True)*100
As per the above visualization, we understand that 64% of records have no outage, 25% have short outages, and the smallest share, 9.8%, have long outages.
def generate_layout_bar(col_name):
    """Build a plotly Layout for a bar chart of `col_name`'s distribution."""
    # Shared tick-label styling for both axes.
    tick_font = dict(
        family='Courier New, monospace',  # font family
        size=14,                          # tick label size
        color='black',                    # tick label colour
    )
    return go.Layout(
        autosize=False,   # fixed size: width/height given explicitly below
        width=800,        # width of the figure in pixels
        height=600,       # height of the figure in pixels
        title="Distribution of {} column".format(col_name),
        # granular control of the title font
        titlefont=dict(family='Courier New, monospace', size=14, color='black'),
        xaxis=dict(tickfont=tick_font),
        yaxis=dict(
            title='Percentage',
            titlefont=dict(size=14, color='black'),
            tickfont=tick_font,
        ),
        # styling of the value labels drawn on the bars themselves
        font=dict(family='Courier New, monospace', color="white", size=12),
    )
def plot_bar(col_name):
    """Render an interactive bar chart of the % distribution of `col_name` in train_1."""
    counts = train_1[col_name].value_counts()
    # Percentage share per category, rounded to 2 decimal places of a percent.
    pct = np.round(counts.values.astype(float)/counts.values.sum(),4)*100
    bar = go.Bar(
        x=counts.index.astype(str),            # category labels on the x axis
        y=pct,                                 # percentage per category
        text=['{}%'.format(p) for p in pct],   # '%' labels shown on the bars
        textposition='auto',                   # let plotly place the labels
        marker=dict(color='#0047AB'),          # Cobalt Blue bars
    )
    fig = go.Figure(data=[bar], layout=generate_layout_bar(col_name=col_name))
    return iplot(fig)
# Percentage distribution of each categorical feature.
plot_bar('area_code')
plot_bar('broadband_type')
plot_bar('outage_type')
plot_bar('log_report_type')
plot_bar('transit_server_type')
def summary(x):
    """Print the five-point summary of numeric column `x` of train_1 and draw
    its density, violin, box and cumulative-density plots in a 2x2 grid."""
    x_min = train_1[x].min()
    x_max = train_1[x].max()
    Q1 = train_1[x].quantile(0.25)
    Q2 = train_1[x].quantile(0.50)
    Q3 = train_1[x].quantile(0.75)
    print(f'5 Point Summary of {x.capitalize()} Attribute:\n'
          f'{x.capitalize()}(min) : {x_min}\n'
          f'Q1 : {Q1}\n'
          f'Q2(Median) : {Q2}\n'
          f'Q3 : {Q3}\n'
          f'{x.capitalize()}(max) : {x_max}')
    # Four complementary views of the same distribution.
    fig = plt.figure(figsize=(16, 10))
    plt.subplots_adjust(hspace = 0.6)
    sns.set_palette('pastel')
    plt.subplot(221)
    ax1 = sns.distplot(train_1[x], color = 'r')
    plt.title(f'{x.capitalize()} Density Distribution')
    plt.subplot(222)
    ax2 = sns.violinplot(x = train_1[x], palette = 'Accent', split = True)
    plt.title(f'{x.capitalize()} Violinplot')
    plt.subplot(223)
    ax2 = sns.boxplot(x=train_1[x], palette = 'cool', width=0.5, linewidth=0.6)
    plt.title(f'{x.capitalize()} Boxplot')
    plt.subplot(224)
    ax3 = sns.kdeplot(train_1[x], cumulative=True)
    plt.title(f'{x.capitalize()} Cumulative Density Distribution')
    plt.show()
def box_plot(x = 'bmi'):
    """Draw an annotated boxplot of train_1[x] and print mean, median, skew and
    the IQR-rule outlier count.  NOTE(review): the default 'bmi' looks like a
    leftover from another dataset -- callers here always pass a column name."""
    def add_values(bp, ax):
        """ This actually adds the numbers to the various points of the boxplots"""
        for element in ['whiskers', 'medians', 'caps']:
            for line in bp[element]:
                # Get the position of the element. y is the label you want
                (x_l, y),(x_r, _) = line.get_xydata()
                # Make sure datapoints exist
                # (I've been working with intervals, should not be problem for this case)
                if not np.isnan(y):
                    x_line_center = x_l + (x_r - x_l)/2
                    y_line_center = y # Since it's a line and it's horizontal
                    # overlay the value: on the line, from center to right
                    ax.text(x_line_center, y_line_center, # Position
                            '%.2f' % y, # Value (2f = 2 decimal float)
                            verticalalignment='center', # Centered vertically with line
                            fontsize=12, backgroundcolor="white")
    fig, axes = plt.subplots(1, figsize=(4, 8))
    # Outliers are drawn as red diamonds.
    red_diamond = dict(markerfacecolor='r', marker='D')
    bp_dict = train_1.boxplot(column = x,
                              grid=True,
                              figsize=(4, 8),
                              ax=axes,
                              vert = True,
                              notch=False,
                              widths = 0.5,
                              showmeans = True,
                              whis = 1.5,
                              flierprops = red_diamond,
                              boxprops= dict(linewidth=3.0, color='black'),
                              whiskerprops=dict(linewidth=3.0, color='black'),
                              return_type = 'dict')
    add_values(bp_dict, axes)
    plt.title(f'{x.capitalize()} Boxplot', fontsize=16)
    plt.ylabel(f'{x.capitalize()}', fontsize=14)
    plt.show()
    # Shape statistics and the 1.5*IQR-rule outlier count.
    skew = train_1[x].skew()
    Q1 = train_1[x].quantile(0.25)
    Q3 = train_1[x].quantile(0.75)
    IQR = Q3 - Q1
    total_outlier_num = ((train_1[x] < (Q1 - 1.5 * IQR)) | (train_1[x] > (Q3 + 1.5 * IQR))).sum()
    print(f'Mean {x.capitalize()} = {train_1[x].mean()}')
    print(f'Median {x.capitalize()} = {train_1[x].median()}')
    print(f'Skewness of {x}: {skew}.')
    print(f'Total number of outliers in {x} distribution: {total_outlier_num}.')
summary('volume')
box_plot('volume')
The box plot helps us identify outliers in distribution and measure variablity in the dataset.
Understanding the relation between outage_duration and other categorical independent variables
(w.r.t. outage_duration == 2) being the highest outage duration
# Which areas contribute most long (class 2) outages.
count = train_1[train_1.outage_duration == 2].area_code.value_counts()
count.head(10).plot(kind="bar", figsize =(10,10))
count.head(10)
=> On the basis of the EDA, the areas most prone to long outage durations are:
area_1100 = 28,
area_1107 = 27,
area_600 = 27,
area_821 = 24,
area_734 = 23..
# Which broadband types contribute most long outages.
count = train_1[train_1.outage_duration == 2].broadband_type.value_counts()
count.head(10).plot(kind="bar", figsize =(10,10))
count.head(10)
=> On the basis of the visualization, the broadband types most suspected of long outage durations are:
broadband_type_8 'ADSL 1' = 498,
broadband_type_2 'ADSL 2' = 62,
broadband_type_6 'ADSL 2+' = 8,
broadband_type_1 'Fiber Ultra' = 5,
broadband_type_7 'Cable' = 4 ..
# Same breakdown for outage type, log report type and transit server type.
count = train_1[train_1.outage_duration == 2].outage_type.value_counts()
count.head(10).plot(kind="bar", figsize =(5,5))
count.head(10)
count = train_1[train_1.outage_duration == 2].log_report_type.value_counts()
count.head(10).plot(kind="bar", figsize =(5,5))
count.head(10)
count = train_1[train_1.outage_duration == 2].transit_server_type.value_counts()
count.head(10).plot(kind="bar", figsize =(5,5))
count.head(10)
On the basis of the above plots, the client would have to focus outage control solutions in:
(1) area_1100, (2) broadband_type_8 'ADSL 1', (3) outage_type_1, (4) log_report_type_82, and (5) transit_server_type_15
as they face maximum outage issues varying from 2 hours to couple of days.
# Volume distribution per target class.
g = sns.FacetGrid(train_1, col='outage_duration')
g.map(plt.hist, 'volume', bins=20)
As per the above visualization, majority composition of volume lies with outage_duration = 0 category, relatively less with outage_duration = 1, and the least with outage_duration = 2.
# Correlation heatmap: only numeric columns participate at this stage.
corrmat = train_1.corr()
plt.subplots(figsize=(10, 5))
sns.heatmap(corrmat, vmax=.9, square=True)
The correlation matrix only shows relation between the numerical attributes (id, volume and outage_duration). To see the complete correlation matrix we would have to convert the categorical columns to integers using label encoding or dummification and then visualize the correlation matrix again.
train_1.dtypes
train_1.isnull().sum()
# One-hot encode two categorical columns so they appear in the correlation view.
train_1 = pd.get_dummies(columns=['outage_type','broadband_type'], data=train_1)
test_1 = pd.get_dummies(columns=['outage_type','broadband_type'], data = test_1)
train_1.head(2)
test_1.head(2)
cat_attr = list(train_1.select_dtypes("category").columns)
num_attr = list(train_1.columns.difference(cat_attr))
num_attr
# NOTE(review): this removes the target from train_1 permanently; train_1 is
# not used for modelling after this point (the second pipeline below rebuilds
# features from the raw files), so this is acceptable here.
train_1.drop(["outage_duration"], axis = 1, inplace = True)
train_1.shape, test_1.shape
corrmat = train_1.corr()
plt.subplots(figsize=(20, 10))
sns.heatmap(corrmat, annot=True, cmap = 'summer_r')
From the correlation plot, we can conclude that broadband_type_8 shows a weak correlation with broadband_type_2, and broadband_type_9 shows a strong correlation with outage_type_5.
1) Map the outage_types correlating to the high outage duration against id column to detect outage durations. In turn, build on improving the specific outage type to improve outage issues.
2) Map the transit server types with the highest outage durations against the id column to detect what kind of servers are facing high outage durations and implement corrective actions for it.
3) Using GridWatch, the sudden drops of WiFi signals, ambient lights, and charging patterns of users penetrating across the city can be monitored.
(Research article reference : https://noahklugman.com/papers/deployment_smartgridcomm.pdf)
# ---- Second modelling pipeline: rebuild features with pivot tables ----
# Reload the raw tables (fresh copies, untouched by the first pipeline).
train_data = pd.read_csv("../input/train_data.csv")
broadband = pd.read_csv("../input/broadband_data.csv")
outage = pd.read_csv("../input/outage_data.csv")
report = pd.read_csv("../input/report_data.csv")
server = pd.read_csv("../input/server_data.csv")
test_data = pd.read_csv("../input/test_data-1593798292529.csv")
# NOTE(review): as earlier, these re-reads overwrite the ../input loads;
# keep only the path that matches the runtime environment.
train_data = pd.read_csv("train_data.csv")
broadband = pd.read_csv("broadband_data.csv")
outage = pd.read_csv("outage_data.csv")
report = pd.read_csv("report_data.csv")
server = pd.read_csv("server_data.csv")
sample = pd.read_csv("sample_submission.csv")
test_data = pd.read_csv("test_data-1593798292529.csv")
train_data.head(2)
test_data.head(2)
print(train_data.shape)
print(broadband.shape)
print(outage.shape)
print(report.shape)
print(server.shape)
print(sample.shape)
print(test_data.shape)
Checking the unique values of 'id' column in all the datasets to understand duplicate values and thereby help in understanding the basis of merging the datasets.
print(len(train_data.id.unique()))
print(len(broadband.id.unique()))
print(len(outage.id.unique()))
print(len(report.id.unique()))
print(len(server.id.unique()))
print(len(test_data.id.unique()))
We understand that duplicate values for 'id' column exist in broadband, report, and server datasets.
Instead of dropping duplicate id values, we merge the files using pivot table function to avoid loss of data.
# Tag rows with their split before concatenating so they can be separated
# again after feature construction.
train_data['origin'] = 'train'
test_data['origin'] = 'test'
data_temp = pd.concat([train_data,test_data], ignore_index=True)
print(data_temp.head())
print(data_temp.tail())
print(train_data.shape)
print(test_data.shape)
print(data_temp.shape)
print(broadband.shape)
print(outage.shape)
print(report.shape)
print(server.shape)
print(outage.head(2))
print(report.head(2))
print(server.head(2))
# Keep only the key, area, split flag and target as the merge backbone.
data_columns=["id","area_code","origin","outage_duration"]
data_columns
data=data_temp[data_columns]
data.head(2)
data.tail(2)
print(broadband.head(2))
# Attach split/target info to each broadband row.
broadband = broadband.merge(data, on='id', how='left')
broadband.head(2)
broadband_unq = pd.DataFrame(broadband['broadband_type'].value_counts())
broadband_unq.head()
Determine % of training samples:
# Fraction of each broadband_type's rows that come from the train split.
broadband_unq['PercTrain'] = broadband.pivot_table(values='origin',index='broadband_type',aggfunc=lambda x: sum(x=='train')/float(len(x)))
broadband_unq.head(10)
Determine the mode of each:
# Most common outage_duration per broadband_type (train rows only).
broadband_unq['Mode_outage_duration'] = broadband.loc[broadband['origin']=='train'].pivot_table(values='outage_duration',index='broadband_type', aggfunc=lambda x: mode(x).mode[0])
broadband.loc[broadband['broadband_type']=='broadband_2']
# One column per broadband_type counting occurrences per id.
broadband_merge = broadband.pivot_table(values='origin',index='id',columns='broadband_type',aggfunc=lambda x: len(x),fill_value=0)
broadband_merge.head(2)
broadband_merge.shape
broadband_data = data.merge(broadband_merge, left_on='id', right_index=True)
broadband_data.head(2)
print(broadband_data.shape)
broadband_temp=broadband_data.copy()
# Repeat the same pivot/merge for outage types.
outage['outage_type'].value_counts().head()
outage = outage.merge(data, on='id', how='left')
outage.head(2)
outage_unq = pd.DataFrame(outage['outage_type'].value_counts())
outage_unq.head()
outage_unq['PercTrain'] = outage.pivot_table(values='origin',index='outage_type',aggfunc=lambda x: sum(x=='train')/float(len(x)))
outage_unq.head()
Determine the mode of each:
outage_unq['Mode_outage_duration'] = outage.loc[outage['origin']=='train'].pivot_table(values='outage_duration',index='outage_type', aggfunc=lambda x: mode(x).mode[0])
Merge preprocess into original and then into train:
outage_merge = outage.pivot_table(values='origin',index='id',columns='outage_type', aggfunc=lambda x:len(x), fill_value=0)
outage_merge.head(2)
outage_merge.shape
broadband_outage = broadband_data.merge(outage_merge, left_on='id', right_index=True)
broadband_outage.head(2)
broadband_outage.shape
# Log-report types: many levels, so rare ones get collapsed below.
report['log_report_type'].value_counts()
report = report.merge(data,on='id',how='left')
report.head(2)
report_unq = pd.DataFrame(report['log_report_type'].value_counts())
report_unq.head()
Determine % of training samples:
report_unq['PercTrain'] = report.pivot_table(values='origin',index='log_report_type',aggfunc=lambda x: sum(x=='train')/float(len(x)))
report_unq.head(10)
Determine the mode of each:
report_unq['Mode_outage_duration'] = report.loc[report['origin']=='train'].pivot_table(values='outage_duration',index='log_report_type', aggfunc=lambda x: mode(x).mode[0])
len(report_unq)
Initializing:
# Start every level mapped to itself; rare levels are overwritten below.
report_unq['preprocess'] = report_unq.index.values
Removing the ones in train
# Levels that appear only in the train split carry no signal for the test
# set; mark them as missing.  Bug fix: use one .loc call (row mask, column)
# instead of the original chained `['preprocess'].loc[...] = ...` assignment,
# which pandas flags with SettingWithCopyWarning and may silently not write
# back to report_unq.
report_unq.loc[report_unq['PercTrain']==1, 'preprocess'] = np.nan
# Inspect the tail and head of the mapping table.
report_unq[-100:]
report_unq[:3]
# Keep the `limit` most frequent log_report_type levels as-is and collapse
# the long tail into per-class "others" buckets ('Remove' when no training
# mode exists to fall back on).
limit=128
# Bug fix: positional row selection via df.loc[df.index[limit:], col]
# replaces the original chained `['preprocess'].iloc[limit:] = ...`
# assignment, which triggers SettingWithCopyWarning and can fail to update
# the frame.  (value_counts() gives a unique index, so label selection is safe.)
report_unq.loc[report_unq.index[limit:], 'preprocess'] = (
    report_unq['Mode_outage_duration'].iloc[limit:]
    .apply(lambda x: 'Remove' if pd.isnull(x) else 'log_report others_%d'%int(x))
)
print (report_unq['preprocess'].value_counts())
report_unq
# Map every report row to its (possibly collapsed) preprocess label.
report_data = report.merge(report_unq[['preprocess']], left_on='log_report_type',right_index=True)
print(report_data.head())
report_data['preprocess'].value_counts()
# One count column per preprocess label, per id.
report_merge = report_data.pivot_table(values='volume',index='id',columns='preprocess',aggfunc=lambda x: len(x),fill_value=0)
broadband_outage_report = broadband_outage.merge(report_merge, left_on='id', right_index=True)
broadband_outage_report.head(2)
broadband_outage_report.shape
# Same treatment for transit server types.
server['transit_server_type'].value_counts().head()
server = server.merge(data,on='id', how='left')
server.head(2)
server_unq = pd.DataFrame(server['transit_server_type'].value_counts())
server_unq.head()
Determine % of training samples:
server_unq['PercTrain'] = server.pivot_table(values='origin',index='transit_server_type',aggfunc=lambda x: sum(x=='train')/float(len(x)))
server_unq.head()
Determine the mode of each
server_unq['Mode_outage_duration'] = server.loc[server['origin']=='train'].pivot_table(values='outage_duration',index='transit_server_type', aggfunc=lambda x: mode(x).mode[0])
server_unq.iloc[-20:]
server_unq.nunique()
Initializing:
# Start every level mapped to itself; tail levels are overwritten below.
server_unq['preprocess'] = server_unq.index.values
Removing the ones in train
# Server types that never appear in the training split cannot be learned
# from; flag them for removal.  Bug fix: single .loc(row mask, column)
# assignment replaces the original chained indexing, which raises
# SettingWithCopyWarning and may silently not write back.
server_unq.loc[server_unq['PercTrain']==0, 'preprocess'] = 'Remove'
Replacing the lower ones with mode:
# Keep the `top_unchange` most frequent transit_server_type levels as-is and
# collapse the long tail into per-class "others" buckets ('Remove' when the
# level has no training mode to fall back on).
top_unchange = 33
# Bug fix: df.loc[df.index[top_unchange:], col] replaces the original chained
# `['preprocess'].iloc[...] = ...` assignment (SettingWithCopyWarning; the
# write may be lost).  value_counts() gives a unique index, so this is safe.
server_unq.loc[server_unq.index[top_unchange:], 'preprocess'] = (
    server_unq['Mode_outage_duration'].iloc[top_unchange:]
    .apply(lambda x: 'Remove' if pd.isnull(x) else 'transit_server others_%d'%int(x))
)
# Reuse the named constant instead of the magic literal 33 when inspecting.
server_unq['preprocess'].iloc[top_unchange:]
# Map every server row to its (possibly collapsed) preprocess label.
server_data = server.merge(server_unq[['preprocess']], left_on='transit_server_type',right_index=True)
print(server_data.head(2))
server_data['preprocess'].value_counts()
server_merge = server_data.pivot_table(values='origin',index='id',columns='preprocess',aggfunc=lambda x: len(x),fill_value=0)
# Final feature table: backbone + all pivoted count features.
final_data = broadband_outage_report.merge(server_merge, left_on='id', right_index=True)
final_data.head(2)
final_data.shape
# NOTE(review): this inspects `data.columns`, but the 'Remove' columns live
# in final_data -- the drop below hard-codes the suffixed names instead.
[x for x in data.columns if 'Remove' in x]
final_data.drop(['Remove_x','Remove_y'],axis=1,inplace=True)
# Split the merged feature table back into its train/test partitions.
# Bug fix: .copy() gives each partition its own data, so the in-place drops
# below operate on real frames instead of views of final_data (the original
# raised SettingWithCopyWarning and the drops were not guaranteed to stick).
train = final_data.loc[final_data['origin']=='train'].copy()
test = final_data.loc[final_data['origin']=='test'].copy()
train.drop('origin',axis=1,inplace=True)
# Test rows carry NaN targets from the earlier concat; drop the target
# column along with the split marker.
test.drop(['origin','outage_duration'],axis=1,inplace=True)
# Structure and summary checks on the rebuilt train set.
train.dtypes
train.head()
train.tail()
train.shape, test.shape
train.dtypes
train.describe()
train.describe(include='object')
train.info()
train.isnull().sum()
# Cast area_code to categorical dtype in both splits.
for col in ['area_code']:
    train[col]=train[col].astype('category')
for col in ['area_code']:
    test[col] = test[col].astype('category')
cat_attr = list(train.select_dtypes("category").columns)
num_attr = list(train.columns.difference(cat_attr))
num_attr
cat_attr
# Restore integer labels for the target (the concat with NaN test targets
# presumably left it float-typed -- verify if the cast ever fails).
for col in ['outage_duration']:
    train[col] = train[col].astype(int)
train.dtypes
train.isnull().sum()
# Running tally of how many rows have been seen so far per area_code.
loc_counter = {}
def loc_count(x):
    """Return the 1-based occurrence number of this row's area_code so far."""
    area = x['area_code']
    loc_counter[area] = loc_counter.get(area, 0) + 1
    return loc_counter[area]
# Feature: running occurrence count of each area_code (row-order dependent).
# NOTE(review): loc_counter is shared between the two apply() calls, so the
# test counts continue from where the train counts left off -- confirm this
# is the intended semantics.
train['location_occurence_count'] = train[['id','area_code']].apply(loc_count,axis=1)
test['location_occurence_count'] = test[['id','area_code']].apply(loc_count, axis=1)
# Spot-check the new feature for one area.
locbool = train['area_code']=='area_126'
train.loc[locbool,['id','area_code','location_occurence_count']].head(5)
locbool = test['area_code']=='area_126'
test.loc[locbool,['id','area_code','location_occurence_count']].head()
train.loc[:,['id','area_code','location_occurence_count']].head(100)
test.loc[:,['id','area_code','location_occurence_count']].head(100)
train.head()
test.head()
cat_attr = list(train.select_dtypes("category").columns)
num_attr = list(train.columns.difference(cat_attr))
cat_attr
train.dtypes
num_attr
# NOTE(review): removes the attribute at position 143 from num_attr -- a
# magic index that breaks if the column set changes; confirm which column
# (likely the target) this is meant to exclude and remove it by name.
num_attr.pop(143)
# Integer-encode area_code consistently across both splits.
lb = LabelEncoder()
for col in ['area_code']:
    train[col] = train[col].astype('str')
for col in ['area_code']:
    test[col] = test[col].astype('str')
# Bug fix: the original called fit_transform separately on train and test,
# so the same area_code could map to different integers in the two sets
# (the model would see inconsistent encodings at prediction time).  Fit one
# encoder on the union of both, then transform each split with it.
lb.fit(pd.concat([train['area_code'], test['area_code']]))
train['area_code'] = lb.transform(train['area_code'])
test['area_code'] = lb.transform(test['area_code'])
train.shape, test.shape
train.head(2)
test.head(2)
train.isnull().sum()
There are no missing values in the train data.
train.dtypes
# Features/target split, then a stratified 70/30 hold-out for evaluation.
X, y = train.loc[:,train.columns!='outage_duration'], train.loc[:,'outage_duration']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify =y,random_state=3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
# Model 1: logistic regression baseline.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
train_pred = logreg.predict(X_train)
test_pred = logreg.predict(X_test)
print('Confusion matrix \n')
print(metrics.confusion_matrix(y_test,test_pred))
print('*'*80)
print('\n')
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Predict on the competition test set and write a submission file.
model1_log = logreg.predict(test)
model1_log
final= pd.DataFrame({'ID' : test['id'],
                     'Outage_Duration' : model1_log})
final.to_csv("submission_log_1.csv",index=False)
# Model 2: random forest with default hyper-parameters.
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
train_pred = rfc.predict(X_train)
test_pred = rfc.predict(X_test)
print('Confusion matrix \n')
print(metrics.confusion_matrix(y_test,test_pred))
print('*'*80)
print('\n')
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Submission from the default random forest.
model1_rf = rfc.predict(test)
model1_rf
final2= pd.DataFrame({'ID' : test['id'],
                      'Outage_Duration' : model1_rf})
final2.to_csv("submission_rf_1.csv",index=False)
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
# Hyper-parameter search for the random forest, optimising macro-F1
# (the competition metric) over 5 shuffled folds.
folds = KFold(n_splits= 5, shuffle= True, random_state= 101)
params = {'max_depth':[1, 2, 3, 5],
          'n_estimators':[500,550,600,650],
          'min_samples_leaf': [100, 150, 200, 250],
          'min_samples_split': [150, 200, 250, 300],
          'class_weight':['balanced']}
rf = RandomForestClassifier()
rf_fin = GridSearchCV(estimator= rf, cv = folds, param_grid= params, scoring= 'f1_macro', return_train_score= True)
rf_fin.fit(X_train,y_train)
scores = rf_fin.cv_results_
scores = pd.DataFrame(scores)
scores.head()
print('The best score was achieved using the parameters: {}'.format(rf_fin.best_params_))
# Refit the random forest with the best hyper-parameters found by the grid
# search above.  Bug fix: the original hard-coded a parameter subset and
# silently dropped class_weight='balanced' from the tuned configuration;
# unpacking best_params_ keeps the refit consistent with the search result.
random_final = RandomForestClassifier(**rf_fin.best_params_)
random_final.fit(X_train,y_train)
train_pred = random_final.predict(X_train)
test_pred = random_final.predict(X_test)
print('Confusion matrix \n')
print(metrics.confusion_matrix(y_test,test_pred))
print('*'*80)
print('\n')
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Model 3: decision tree with grid-searched hyper-parameters.
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt.get_params().keys()
dt_params = {'criterion': ['entropy', 'gini'],
             'max_depth': [6,8,9,10],
             'min_samples_split': [2, 3, 4],
             'min_samples_leaf': [1, 2, 3],
             'class_weight':['balanced']}
dt_grid = GridSearchCV(dt, param_grid=dt_params, cv=10, scoring = 'f1_macro')
%%time
dt_grid.fit(X_train,y_train)
dt_grid.best_params_
train_pred = dt_grid.predict(X_train)
test_pred = dt_grid.predict(X_test)
# GridSearchCV.score uses the search's scoring function ('f1_macro') here.
print(dt_grid.score(X_train, y_train))
print(dt_grid.score(X_test, y_test))
print("\n")
print(confusion_matrix(y_true=y_train, y_pred = train_pred))
confusion_matrix_test = confusion_matrix(y_true=y_test, y_pred = test_pred)
confusion_matrix_test
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Submission from the tuned decision tree.
model3_dt = dt_grid.predict(test)
model3_dt
final3= pd.DataFrame({'ID' : test['id'],
                      'Outage_Duration' : model3_dt})
final3.to_csv("submission_dt.csv",index=False)
# Model 4: gradient boosting with a small grid search.
gbm = GradientBoostingClassifier()
gbm.fit(X_train, y_train)
gbm_params = {'max_depth': [4,5,6],
              'subsample': [0.8, 0.3],
              'max_features':[0.4,0.5],
              'n_estimators': [40,50]}
gbm_grid = GridSearchCV(gbm, param_grid= gbm_params, cv = 10, scoring= 'f1_macro', return_train_score= True)
%%time
gbm_grid.fit(X_train,y_train)
gbm_grid.best_params_
train_pred = gbm_grid.predict(X_train)
test_pred = gbm_grid.predict(X_test)
print(gbm_grid.score(X_train, y_train))
print(gbm_grid.score(X_test, y_test))
print("\n")
print(confusion_matrix(y_true=y_train, y_pred = train_pred))
confusion_matrix_test = confusion_matrix(y_true=y_test, y_pred = test_pred)
confusion_matrix_test
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Submission from the tuned GBM.
model4_gbm = gbm_grid.predict(test)
model4_gbm
final4= pd.DataFrame({'ID' : test['id'],
                      'Outage_Duration' : model4_gbm})
final4.to_csv("submission_gbm.csv",index=False)
# Model 5: XGBoost with a grid search (best performer per the conclusion).
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
xgb_params = {'n_estimators':[150],
              'learning_rate': [0.05,0.10],
              'colsample_bytree': [0.5,0.1],
              'reg_alpha': [0.5,1.0],
              'reg_lambda': [0.5,1.1],
              'subsample': [0.5,0.6,0.7],
              'max_depth': [6,8,10],}
xgb_grid = GridSearchCV(xgb, param_grid= xgb_params, cv = 10, scoring= 'f1_macro')
%%time
xgb_grid.fit(X_train,y_train)
train_pred = xgb_grid.predict(X_train)
test_pred = xgb_grid.predict(X_test)
print(xgb_grid.score(X_train, y_train))
print(xgb_grid.score(X_test, y_test))
print("\n")
print(confusion_matrix(y_true = y_train, y_pred = train_pred))
confusion_matrix_test = confusion_matrix(y_true=y_test, y_pred = test_pred)
confusion_matrix_test
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
test.dtypes
# NOTE(review): this submission uses lowercase column names, unlike the
# earlier ones ('ID'/'Outage_Duration') -- confirm which header the grader
# expects and make all submissions consistent.
model5_xgb = xgb_grid.predict(test)
model5_xgb
final5= pd.DataFrame({'id' : test['id'],
                      'outage_duration' : model5_xgb})
final5.to_csv("submission_xgb2.csv",index=False)
# Feature importances from the decision tree fitted earlier.
# NOTE(review): `dt` here is the default-parameter tree fitted before the
# grid search, not the tuned dt_grid model -- confirm which was intended.
dt.feature_importances_
features = train.columns
importances = dt.feature_importances_
indices = np.argsort(importances)[::-1] # np.argsort returns the indices that would sort an array.
# NOTE(review): train.columns still includes the target while dt was fitted
# on X_train -- the label/importance alignment here may be off by one column.
pd.DataFrame([train.columns[indices],np.sort(importances)[::-1]])
# Stand-alone importance demo on synthetic data.
# NOTE(review): this rebinds X, y and dt -- the competition X/y from above
# are lost from this point on.
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
dt = DecisionTreeClassifier()
dt.fit(X, y)
importance = dt.feature_importances_
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.show()
# Model 6: shallow decision tree restricted to the top-ranked features.
select = indices[0:186]
dt = tree.DecisionTreeClassifier(max_depth=3)
dt = dt.fit(X_train.iloc[:,select], y_train)
train_pred = dt.predict(X_train.iloc[:,select])
test_pred = dt.predict(X_test.iloc[:,select])
print('Classification report \n')
print(metrics.classification_report(y_train, train_pred))
print(metrics.classification_report(y_test,test_pred))
# Bug fix: the tree was fitted on the `select` column subset, so the
# competition test frame must be sliced the same way before predicting
# (the original passed the full frame, mismatching the training columns).
model6_dt2 = dt.predict(test.iloc[:,select])
model6_dt2
final6= pd.DataFrame({'ID' : test['id'],
                      'Outage_Duration' : model6_dt2})
final6.to_csv("submission_dt2.csv",index=False)
# Distribution of the hold-out target labels.
sns.distplot((y_test))
FINAL MODEL: As seen above, the XGBoost model gives us the best results, with an F1 score of 85.5% on the train data and 60% on the test data.